The Vehicle Sales and Market Trends Dataset provides a comprehensive collection of information pertaining to the sales transactions of various vehicles.
The objective of this analysis is to explore sales distribution across different states.
The objectives are clear, the questions to abroad are the following:
A short preview of the data frame.
head(cars)
## # A tibble: 6 × 16
## year make model trim body transmission vin state condition odometer
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 2015 Kia Sorento LX SUV automatic 5xyk… ca 5 16639
## 2 2015 Kia Sorento LX SUV automatic 5xyk… ca 5 9393
## 3 2014 BMW 3 Series 328i… Sedan automatic wba3… ca 45 1331
## 4 2015 Volvo S60 T5 Sedan automatic yv16… ca 41 14282
## 5 2014 BMW 6 Series… 650i Sedan automatic wba6… ca 43 2641
## 6 2015 Nissan Altima 2.5 S Sedan automatic 1n4a… ca 1 5554
## # ℹ 6 more variables: color <chr>, interior <chr>, seller <chr>, mmr <dbl>,
## # sellingprice <dbl>, saledate <chr>
Missing values, minimums, maximums, means and percentiles and more…
skim_without_charts(cars)
| Name | cars |
| Number of rows | 558837 |
| Number of columns | 16 |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| make | 10301 | 0.98 | 2 | 13 | 0 | 96 | 0 |
| model | 10399 | 0.98 | 1 | 29 | 0 | 973 | 0 |
| trim | 10651 | 0.98 | 1 | 46 | 0 | 1963 | 0 |
| body | 13195 | 0.98 | 3 | 23 | 0 | 87 | 0 |
| transmission | 65352 | 0.88 | 5 | 9 | 0 | 4 | 0 |
| vin | 4 | 1.00 | 9 | 17 | 0 | 550297 | 0 |
| state | 0 | 1.00 | 2 | 17 | 0 | 64 | 0 |
| color | 749 | 1.00 | 1 | 9 | 0 | 46 | 0 |
| interior | 749 | 1.00 | 1 | 9 | 0 | 17 | 0 |
| seller | 0 | 1.00 | 3 | 50 | 0 | 14263 | 0 |
| saledate | 12 | 1.00 | 4 | 39 | 0 | 3766 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| year | 0 | 1.00 | 2010.04 | 3.97 | 1982 | 2007 | 2012 | 2013 | 2015 |
| condition | 11820 | 0.98 | 30.67 | 13.40 | 1 | 23 | 35 | 42 | 49 |
| odometer | 94 | 1.00 | 68320.02 | 53398.54 | 1 | 28371 | 52254 | 99109 | 999999 |
| mmr | 38 | 1.00 | 13769.38 | 9679.97 | 25 | 7100 | 12250 | 18300 | 182000 |
| sellingprice | 12 | 1.00 | 13611.36 | 9749.50 | 1 | 6900 | 12100 | 18200 | 230000 |
Column names
colnames(cars)
## [1] "year" "make" "model" "trim" "body"
## [6] "transmission" "vin" "state" "condition" "odometer"
## [11] "color" "interior" "seller" "mmr" "sellingprice"
## [16] "saledate"
Deleting observations that contains states with incorrect values, such as ‘3vwd17aj5fm297123’. And observations with empty values on the columns: seller, sellingprice and model.
cars2 <- cars %>%
filter(nchar(state) <= 2) %>%
drop_na(saledate, seller, sellingprice, model)
nrow(cars) #Observations before
## [1] 558837
nrow(cars2) #Observations after
## [1] 548400
Calculate the number of sold cars in each state.
table(cars2$state)
##
## ab al az ca co fl ga hi il in la ma md
## 801 25 8575 71658 7661 81480 34056 1222 23188 4273 2164 6525 10841
## mi mn mo ms nc ne nj nm ns nv ny oh ok
## 15287 9301 15798 1814 21333 3954 27331 164 46 12453 5609 21209 69
## on or pa pr qc sc tn tx ut va wa wi
## 3008 1136 53154 2673 1108 4146 20691 45195 1798 11694 7256 9704
Visualizing the distribution of sales per state.
ggplot(cars2, aes(x = reorder(state, table(state)[state]))) +
geom_bar() +
labs(title = "Number of Cars Sold by State", x = "State", y = "Number of Cars Sold")
Answer = The top 5 states with highest sales are: Fl, Ca, Pa, Tx and Ga. On the other hand, the top 5 states with lowest sales are: Al, Ns, Ok, Nm and Ab.
Find the average selling price for cars in each state.
avg_price_by_state <- aggregate(sellingprice ~ state, data = cars2, FUN = mean)
avg <- avg_price_by_state[order(avg_price_by_state$sellingprice, decreasing=TRUE),]
kable(avg, caption = "Average selling price by state")
| state | sellingprice | |
|---|---|---|
| 27 | on | 18232.231 |
| 33 | tn | 17069.398 |
| 29 | pa | 16038.392 |
| 5 | co | 15985.261 |
| 23 | nv | 15152.356 |
| 14 | mi | 14982.873 |
| 9 | il | 14852.803 |
| 16 | mo | 14557.157 |
| 4 | ca | 14540.312 |
| 25 | oh | 14380.163 |
| 37 | wa | 14077.437 |
| 38 | wi | 14054.818 |
| 31 | qc | 14003.339 |
| 6 | fl | 13976.670 |
| 15 | mn | 13789.779 |
| 20 | nj | 13626.459 |
| 34 | tx | 13296.173 |
| 7 | ga | 12990.106 |
| 19 | ne | 12681.879 |
| 8 | hi | 12296.154 |
| 35 | ut | 12227.997 |
| 1 | ab | 11793.009 |
| 28 | or | 11667.738 |
| 3 | az | 11627.498 |
| 24 | ny | 11479.330 |
| 17 | ms | 11364.407 |
| 10 | in | 10793.289 |
| 11 | la | 10592.970 |
| 32 | sc | 10429.096 |
| 22 | ns | 10041.304 |
| 30 | pr | 9528.077 |
| 2 | al | 9440.000 |
| 18 | nc | 8744.494 |
| 36 | va | 8684.673 |
| 13 | md | 7504.937 |
| 26 | ok | 6978.261 |
| 12 | ma | 6789.080 |
| 21 | nm | 6442.683 |
Compare prices across states.
#Setting the plot for the average selling price for cars in each state.
avg_plot <- ggplot(avg_price_by_state, aes(x = reorder(state, sellingprice), y = sellingprice,
text = paste("State: ", state, "<br>",
"Average Price: $", round(sellingprice, 2)))) +
geom_bar(stat = "identity") +
labs(title = "Average Selling Price by State", x = "State", y = "Average Selling Price") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
interactive_plot <- ggplotly(avg_plot, tooltip = "text")
# Disable zoom, pan, and other controls
interactive_plot <- interactive_plot %>%
config(displayModeBar = FALSE,
scrollZoom = FALSE,
showTips = FALSE)
# Display the customized interactive plot
interactive_plot
Answer = The state with the highest average selling price is On with $18,232 and the state with the lowest average selling price is Nm with $6,442.
Identify the most popular car makes and models in each state.
# Select the top 6 states by total sales
top_states <- cars2 %>%
group_by(state) %>%
summarise(total_sales = n()) %>%
top_n(6, total_sales) %>%
pull(state)
# Select the top 10 makes by total sales
top_makes <- cars2 %>%
group_by(make) %>%
summarise(total_sales = n()) %>%
top_n(10, total_sales) %>%
pull(make)
# Filter the dataset to include only the top states and top makes
sales <- cars2 %>%
filter(state %in% top_states, make %in% top_makes) %>%
group_by(make, state) %>%
summarise(counter = n()) %>%
ungroup()
Visualizing the top 10 car makes and models for the top 6 states
# Plotting with vertical bars
sales %>%
mutate(make_fact = make,
make = reorder_within(make, counter, state)) %>%
ggplot(aes(make, counter, fill = make_fact)) +
geom_col(show.legend = FALSE) +
scale_fill_brewer(palette = "Set3") +
scale_x_reordered() +
facet_wrap(~state, scales = "free_x") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Heat-map limited to just the top 20 car makes and states, identify the top car makes and states based on the number of cars sold. Similar process as above.
# Calculate total sales by car make and state
total_make_sales <- cars2 %>%
group_by(make) %>%
summarise(total_sales = n()) %>%
arrange(desc(total_sales)) %>%
top_n(20, total_sales) %>%
pull(make)
total_state_sales <- cars2 %>%
group_by(state) %>%
summarise(total_sales = n()) %>%
arrange(desc(total_sales)) %>%
top_n(20, total_sales) %>%
pull(state)
# Filter the Data for Top 20 Car Makes and States
filtered_sales <- cars2 %>%
filter(make %in% total_make_sales & state %in% total_state_sales) %>%
group_by(state, make) %>%
summarise(count = n()) %>%
ungroup()
# Create the tooltip text
filtered_sales <- filtered_sales %>%
mutate(text = paste0("State: ", state, "\n",
"Make: ", make, "\n",
"Cars Sold: ", count))
# Create the Heatmap
p <- ggplot(filtered_sales, aes(x = state, y = make, fill = count, text = text)) +
geom_tile() +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
theme_ipsum() +
labs(title = "Popularity of Top 20 Car Makes by Top 20 States",
x = "State",
y = "Car Make",
fill = "Cars Sold")
# Adding interactivity with plotly
ggplotly(p, tooltip = "text")
Answer = The visualizations show that certain car makes indeed have regional popularity. For instance, Toyota is more popular in Florida (8,713 cars sold) than in California (4,663 cars sold).
The data suggests that car preferences can vary significantly across states. To better understand what influences buyers, a deeper investigation is required. Variables such as price, year, odometer, and color should be considered.
Some trends and relations have been found during the analysis.
Transmission: Automatic transmissions are increasingly popular in all states, representing 96% of all car sales.
# Select the top 12 states by total sales
top_states <- cars2 %>%
group_by(state) %>%
summarise(total_sales = n()) %>%
top_n(12, total_sales) %>%
pull(state)
# Droping N/A transmission values
cars_f <- cars2 %>%
filter(state %in% top_states) %>%
drop_na(transmission)
# Plotting
ggplot(data = cars_f) +
geom_bar(mapping = aes(x = transmission, fill = transmission)) +
facet_wrap(~state) +
labs(title = "Car Sales by Transmission Type in Top 12 States",
x = "Transmission Type",
y = "Number of Cars Sold")
ggplot(data = cars_f) +
geom_bar(mapping = aes(y = transmission, fill = transmission)) +
labs(title = "Total car sales by transmission",
x = "Transmission Type",
y = "Number of Cars Sold") +
scale_x_continuous(labels = comma)
# Calculate the percentage of automatic transmission sales
auto_percentage <- mean(cars_f$transmission == "automatic") * 100
(auto_percentage)
## [1] 96.6728
Price - Sells: The majority of vehicles sold are priced between $1,701 and $12,00, with a 44% of the total sales.
# Calculate the percentages for each price range
quartiles2 <- cut(cars2$sellingprice,
breaks = c(1, 1700, 12200, 18300, 230000),
labels = c("$1,700", "$12,200", "$18,300", "$230,000"),
include.lowest = TRUE)
cars_prices <- cars2 %>%
mutate(price_quartile = quartiles2)
price_distribution <- cars_prices %>%
group_by(price_quartile) %>%
summarise(count = n()) %>%
mutate(percentage = round((count / sum(count)) * 100, 1))
# Create the plot
ggplot(price_distribution, aes(x = 1, y = percentage, fill = price_quartile)) +
geom_bar(stat = "identity", width = 0.5) +
geom_text(aes(label = paste0(percentage, "%")), position = position_stack(vjust = 0.5), color = "white") +
labs(title = "Distribution of Car Sales by Price Range",
x = "Range of car price",
y = "Percentage of Total Sales") +
scale_fill_brewer(palette = "Set2")
# Calculate the percentage of sales within the price range $1,701 to $12,200, to verify.
price_percentage <- mean(cars2$sellingprice >= 1701 & cars2$sellingprice <= 12200) * 100
price_percentage
## [1] 44.43508